sink("log/numerical_example_log.txt", append = FALSE, type = "output")


## You can install the development version of dbw from GitHub:
devtools::install_github("hirotokatsumata/dbw")

## Load packages
library(dbw)
library(tidyverse)

## Double check that you did not load Hmisc package
detach(package:Hmisc,unload=TRUE)
# You will get an error message but it is okay


## Logistic function
logistic <- function (x) {c(exp(x) / (1 + exp(x)))}

## ggplot2 settings
num_ex_classics <- theme_classic() +  
  theme(axis.title = element_text(size = 10.4),
        axis.text = element_text(size = 11),
        plot.title = element_text(hjust = 0.5, size = 12),
        legend.title = element_text(size = 0),
        legend.text = element_text(size = 10),
        legend.background = element_rect(fill = NA, colour = NA))

## Generate a dataset
set.seed(20230501)
n <- 2000
df <- data.frame(x = rnorm(n, 0, 1)) %>%
			mutate(trueps = logistic((x + 1)^2 / 2 - 1.5),
						 treat = rbinom(n = n, size = 1, prob = trueps),
						 trueweight = treat / trueps + (1 - treat) / (1 - trueps))
n1 <- sum(df$treat)
n0 <- sum(1 - df$treat)

## Estimation
result_ndbw <- dbw(formula_y = trueps ~ 1, 
                   formula_ps = treat ~ x, 
                   estimand = "AO", 
                   method = "dbw", 
                   method_y = "wls", 
                   data = df, 
                   normalize = TRUE,
                   vcov = FALSE,
                   lambda = 0)
result_mle <- dbw(formula_y = trueps ~ 1, 
                  formula_ps = treat ~ x, 
                  estimand = "AO", 
                  method = "mle", 
                  method_y = "wls", 
                  data = df, 
                  normalize = FALSE,
                  vcov = FALSE,
                  lambda = 0)

result <- data.frame(df, 
                     weightndbw = result_ndbw$est_weights * n, 
                     weightmle = result_mle$est_weights * n, 
                     psndbw = result_ndbw$ps, 
                     psmle = result_mle$ps) %>%
          filter(treat == 1)


## Values mentioned in Section 2.1
## RMSE of the weights
result %>% mutate(rmse_w_ndbw = (weightndbw - trueweight)^2,
                  rmse_w_mle = (weightmle - trueweight)^2) %>%
           summarize(rmse_w_ndbw = sqrt(mean(rmse_w_ndbw)),
                     rmse_w_mle = sqrt(mean(rmse_w_mle)))

## RMSE of the PS
result %>% mutate(rmse_ps_ndbw = (psndbw - trueps)^2,
									rmse_ps_mle = (psmle - trueps)^2) %>%
					 summarize(rmse_ps_ndbw = sqrt(mean(rmse_ps_ndbw)),
						 				 rmse_ps_mle = sqrt(mean(rmse_ps_mle)))


## Figure B.1 (Online Supplementary Materials B)
## (a-1) Covariate distribution balance
## Cumulative distribution functions
result %>%
arrange(x) %>%
mutate(wecdf_ndbw = cumsum(weightndbw) / sum(weightndbw),
			 wecdf_mle = cumsum(weightmle) / sum(weightmle),
			 wecdf_true = cumsum(trueweight) / sum(trueweight)) %>%
ggplot(., aes(x = x, y = wecdf_ndbw)) +
	geom_path(linetype = "dashed") +
	geom_path(aes(x = x, y = wecdf_true), inherit.aes = FALSE) +
	geom_path(aes(x = x, y = wecdf_mle), inherit.aes = FALSE, linetype = "dotted") +
	geom_hline(yintercept = 1) +
  xlab("X") +
  ylab("Empirical cumulative distribution functions") +
	num_ex_classics
ggsave("figures/fig_b1.pdf", width = 13, height = 9, units = "cm")
#ggsave("figures/num_ex_cdf_t.pdf", width = 13, height = 9, units = "cm")


## Values mentioned in Section 2 (a-1 and b-1)
## (a-1) KS statistic
result %>%
arrange(x) %>%
mutate(wecdf_ndbw = cumsum(weightndbw) / sum(weightndbw),
			 wecdf_mle = cumsum(weightmle) / sum(weightmle),
			 wecdf_true = cumsum(trueweight) / sum(trueweight)) %>%
summarize(KS_ndbw = max(abs(wecdf_true - wecdf_ndbw)), 
					KS_mle = max(abs(wecdf_true - wecdf_mle)))

## (b-1) Relative errors of weights
result %>% mutate(rel_error_ndbw = (weightndbw / trueweight - 1)^2,
									rel_error_mle = (weightmle / trueweight - 1)^2) %>%
					 summarize(relative_error_ndbw = mean(rel_error_ndbw),
						 				 relative_error_mle = mean(rel_error_mle))


## Figures
maxy_w <- max(c(result$weightndbw, result$weightmle))
max_w <- max(c(maxy_w, result$trueweight)) + 1
df_poly_w <- data.frame(x = c(0, 2, max_w, max_w, max_w - 2, 0),
												y = c(0, 0, max_w - 2, max_w, max_w, 2))
df_poly_ps_t <- data.frame(x = c(seq(0, 1, length.out = 50), 1, seq(1/3, 0, length.out = 50)),
													 y = c(seq(0, 1, length.out = 50) / (1 + 2 * seq(0, 1, length.out = 50)), 1, 
													 				seq(1/3, 0, length.out = 50) / (1 - 2 * seq(1/3, 0, length.out = 50))))

## Figure 2 (left panel)
## True vs estimated weights
result %>% 
pivot_longer(cols = c(weightndbw, weightmle), names_to = "estimator", values_to = "estweight") %>%
ggplot(., aes(x = trueweight, y = estweight, shape = estimator, size = estimator)) +
  geom_point(stroke = 0.5, alpha = 0.4) +
	geom_abline(intercept = 0, slope = 1, linewidth = 0.5, alpha = 0.5) +
	geom_abline(intercept = 2, slope = 1, linewidth = 0.3, alpha = 0.2, linetype = "dashed") +
	geom_abline(intercept = -2, slope = 1, linewidth = 0.3, alpha = 0.2, linetype = "dashed") +
	geom_polygon(data = df_poly_w, aes(x = x, y = y), inherit.aes = FALSE, fill = "black", alpha = 0.08) +
  xlab("True inverse probability weights") +
  ylab("Estimated inverse probability weights") +
  ggtitle("Weights for treated units") +
  scale_shape(name = "shape_size", solid = FALSE, 
              labels = c("weightndbw" = " Proposed", "weightmle" = " MLE")) +
  scale_size_manual(name = "shape_size", values = c(1.1, 1.0), 
                    labels = c("weightndbw" = " Proposed", "weightmle" = " MLE")) +
  scale_x_continuous(expand = c(0, 0), limit = c(0, max_w)) +
  scale_y_continuous(expand = c(0, 0), limit = c(0, max_w)) +
  coord_fixed() +
	num_ex_classics +
	theme(legend.position = c(0.8, 0.3))
ggsave("figures/fig_2_left.pdf", width = 8, height = 8, units = "cm")
#ggsave("figures/num_ex_weights_t.pdf", width = 8, height = 8, units = "cm")

## Figure 2 (right panel)
## True vs estimated PSs
result %>% 
pivot_longer(cols = c(psndbw, psmle), names_to = "estimator", values_to = "estps") %>%
ggplot(., aes(x = trueps, y = estps, shape = estimator, size = estimator)) +
  geom_point(stroke = 0.5, alpha = 0.4) +
	geom_abline(intercept = 0, slope = 1, linewidth = 0.5, alpha = 0.5) +
	geom_polygon(data = df_poly_ps_t, aes(x = x, y = y), inherit.aes = FALSE, fill = "black", alpha = 0.08, 
								colour = "grey80",linetype = "dashed") + 
  xlab("True propensity scores") +
  ylab("Estimated propensity scores") +
  ggtitle("Propensity scores for treated units") +
  scale_shape(name = "shape_size", solid = FALSE, 
              labels = c("psndbw" = " Proposed", "psmle" = " MLE")) +
  scale_size_manual(name = "shape_size", values = c(1.1, 1.0), 
                    labels = c("psndbw" = " Proposed", "psmle" = " MLE")) +
  scale_x_continuous(expand = c(0, 0), limit = c(0, 1)) +
  scale_y_continuous(expand = c(0, 0), limit = c(0, 1)) +
  coord_fixed() +
	num_ex_classics +
	theme(legend.position = c(0.2, 0.8),
				panel.border = element_rect(colour = "black", fill = NA, linewidth = 1),
				plot.margin = unit(c(5.5, 7.5, 5.5, 5.5), "pt"))
ggsave("figures/fig_2_right.pdf", width = 8.1, height = 8, units = "cm")
#ggsave("figures/num_ex_ps_t.pdf", width = 8.1, height = 8, units = "cm")


## Non-normalized distribution balancing weighting
result_ndbw_nn <- dbw(formula_y = trueps ~ 1, 
                      formula_ps = treat ~ x, 
                      estimand = "AO", 
                      method = "dbw", 
                      method_y = "wls", 
                      data = df, 
                      normalize = FALSE,
                      vcov = FALSE,
                      lambda = 0)

result_nn <- data.frame(df, 
                        weightndbw_nn = result_ndbw_nn$est_weights * n, 
                        psndbw_nn = result_ndbw_nn$ps) %>%
             filter(treat == 1)

## Values mentioned in Section 4.1
## RMSE of the weights
result_nn %>% mutate(rmse_w_ndbw_nn = (weightndbw_nn - trueweight)^2) %>%
              summarize(rmse_w_ndbw_nn = sqrt(mean(rmse_w_ndbw_nn)))

## RMSE of the PS
result_nn %>% mutate(rmse_ps_ndbw_nn = (psndbw_nn - trueps)^2) %>%
              summarize(rmse_ps_ndbw_nn = sqrt(mean(rmse_ps_ndbw_nn)))


## Distribution balancing weighting with regularization
rmse_w_lambda <- function (lambda) {
  res_std_comp <- std_comp(formula_y = trueps ~ 1, 
                           formula_ps = treat ~ x, 
                           estimand = "AO", 
                           method_y = "wls", 
                           data = df, 
                           std = TRUE,
                           weights = NULL)
  result_ndbw <- dbw(formula_y = trueps ~ 1, 
                     formula_ps = res_std_comp$formula_ps, 
                     estimand = "AO", 
                     method = "dbw", 
                     method_y = "wls", 
                     data = res_std_comp$data, 
                     normalize = TRUE,
                     vcov = FALSE,
                     lambda = lambda)
  result <- data.frame(df, 
                       weightndbw = result_ndbw$est_weights * n) %>%
            filter(treat == 1)
  result %>% mutate(rmse_w_ndbw = (weightndbw - trueweight)^2) %>%
             summarize(rmse_w_ndbw = sqrt(mean(rmse_w_ndbw))) %>%
             data.frame(.)
}

plot_regular <- function (lambda) {
  result <- numeric(length(lambda))
  for (i in 1:length(lambda)) {
    result[i] <- rmse_w_lambda(lambda = lambda[i])$rmse_w_ndbw
  }
  data.frame(lambda, result)
}

## RMSE of the weights with n / n_1
rmse_w0 <- sqrt(mean((result$treat / mean(df$treat) - result$trueweight)^2))

## Estimation with different lambda
result_regular <- plot_regular(lambda = c(seq(0, 0.5, by = 0.01), seq(0.6, 5, by = 0.1)))
lambda_optimal <- result_regular$lambda[order(result_regular$result)][1]
lambda_optimal


## Figure 3
## Plot the results
ggplot(result_regular, aes(x = lambda, y = result)) +
  geom_line(linewidth = 0.7) +
  geom_vline(xintercept = lambda_optimal, linewidth = 0.5, linetype = "dotted") +
  geom_hline(yintercept = rmse_w0, linewidth = 0.5, linetype = "dotted") +
  scale_x_continuous(expand = c(0, 0), limit = c(0, 5.01)) +
  scale_y_continuous(expand = c(0, 0), limit = c(0, 1.52)) +
  xlab("Regularization hyper-parameter (lambda)") +
  ylab("RMSE of the weights") +
  num_ex_classics
ggsave("figures/fig_3.pdf", width = 10, height = 7.5, units = "cm")
#ggsave("figures/regularization.pdf", width = 10, height = 7.5, units = "cm")


## Figure C.1 (Online Supplementary Materials C)
## Comparison of the upper bound of bias for PS between the proposed one and Tan (2020)'s.
a <- seq(0.1, 1, by = 0.01)
ub_proposed <- (1 - a)^2 / (a * (1 - a + a * log(a)))
ub_proposed[length(a)] <- 2
ub_tan <- 5 / (3 * a)
ub_tan[a > 1 / 2] <- NA
res <- data.frame(a = rep(a, 2), 
                  ub = c(ub_proposed, ub_tan),
                  type = rep(c("p", "t"), each = length(a)))
ggplot(res, aes(x = a, y = ub, group = type, linetype = type)) +
  geom_line(linewidth = 0.7) +
  geom_vline(xintercept = 0.5, linewidth = 0.5, linetype = "dotted") +
  geom_vline(xintercept = 1, linewidth = 1, linetype = "solid") +
  annotate("text", x = 0.5, y = 1.6, label = expression(paste("Proposed evaluation (", {0 < "a"} <= 1, ")"))) +
  annotate("text", x = 0.4, y = 12, label = expression(paste("Tan (2020)'s evaluation (", {0 < "a"} <= 0.5, ")"))) +
  scale_x_continuous(expand = c(0, 0), limit = c(0.1, 1), breaks = c(0.1, 0.5, 1.0)) +
  scale_y_continuous(expand = c(0, 0), limit = c(0, NA)) +
  xlab("a") +
  ylab("Upper bound of the relative errors of the propensity scores  ") +
  theme_classic() +
  theme(legend.position = "none",
        axis.title = element_text(size = 10.4),
        axis.text = element_text(size = 11),
        plot.title = element_text(hjust = 0.5, size = 12),
        legend.title = element_text(size = 0),
        legend.text = element_text(size = 10),
        legend.background = element_rect(fill = NA, colour = NA),
        plot.margin = unit(c(5.5, 10, 5.5, 5.5), "points"))
ggsave("figures/fig_c1.pdf", width = 13, height = 10, units = "cm")
#ggsave("figures/comparison_ub.pdf", width = 13, height = 10, units = "cm")

sink()
